# Load packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  data.table, janitor, magrittr, fixest, 
  broom, tidyverse, tidylog, readxl, sf, maps, tigris
)
options("tidylog.display" = NULL)

unemployment = read_xlsx("raw/BLS/laucnty97.xlsx", skip = 5, col_names = F) %>% 
  select(2,3,7,8,9) %>% 
  set_colnames(c("state_fips", "county_fips", "labor_force", "employed", "unemployed")) %>% 
  mutate(fips = paste0(state_fips, county_fips),
         unemployed_to_employed = unemployed / employed) %>% 
  select(fips, unemployed_to_employed)

# mismatches between BEA AND BLS
county_income_industry <- fread("raw/BEA//CAINC5S__ALL_AREAS_1969_2000.csv", fill = TRUE) %>%
  as_tibble() %>%
  clean_names() %>%
  filter(geo_fips != "00000" & industry_classification != "..." & industry_classification != "")

counties <- counties(cb = TRUE, year = 2020) %>%
  st_as_sf() %>%
  dplyr::rename(fips = GEOID) %>% 
  mutate(STATEFP = as.numeric(STATEFP)) %>% 
  filter(STATEFP != 15 & STATEFP != 2 & STATEFP <= 56 & STATEFP != 11)  %>% 
  st_transform(5070)

counties_sdf <- st_as_sf(maps::map("county", plot = FALSE, fill = TRUE)) %>% 
  rename(polyname = ID) %>% 
  full_join(maps::county.fips) %>% 
  mutate(fips = ifelse(fips == 46113, 46102, fips)) %>% 
  mutate(fips = ifelse(polyname == "florida,okaloosa", 12091, fips)) %>% 
  mutate(fips = ifelse(polyname == "virginia,accomack", 51001, fips)) %>% 
  mutate(fips = ifelse(polyname == "texas,galveston", 48167, fips)) %>% 
  mutate(fips = ifelse(polyname == "louisiana,st martin", 22099, fips)) %>% 
  mutate(fips = ifelse(polyname == "north carolina,currituck", 37053, fips)) %>% 
  mutate(fips = ifelse(polyname == "washington,pierce", 53053, fips)) %>% 
  mutate(fips = ifelse(polyname == "washington,san juan", 53055, fips)) %>% 
  mutate(fips = str_pad(fips, 5, "left", pad = "0")) %>% 
  st_transform(5070)

va_bea = county_income_industry %>% 
  distinct(geo_fips, .keep_all = TRUE) %>% 
  rename(fips = geo_fips) %>% 
  filter(as.numeric(fips) > 51000 & as.numeric(fips) < 52000)
va_maps = counties_sdf %>% 
  filter(as.numeric(fips) > 51000 & as.numeric(fips) < 52000)

mismatch = full_join(va_bea, va_maps) %>% 
  dplyr::select(fips, polyname, geo_name) %>% 
  filter(is.na(polyname) | is.na(geo_name)) %>% 
  filter(!str_detect(geo_name, "Independent") | is.na(geo_name)) %>% 
  rename(geo_fips = fips)

mismatch$polyname[is.na(mismatch$polyname)] = mismatch$polyname[!is.na(mismatch$polyname)]
mismatch$geo_name[is.na(mismatch$geo_name)] = mismatch$polyname[!is.na(mismatch$geo_name)]

va_cross = va_maps %>% left_join(mismatch) %>% 
  mutate(geo_fips = ifelse(is.na(geo_fips), fips, geo_fips)) %>% 
  mutate(fips_maps = fips, fips_bea = geo_fips) %>% 
  as_tibble() %>% 
  dplyr::select(-geom) %>% 
  dplyr::select(polyname, fips_maps, fips_bea) 

unemployment = unemployment %>% 
  mutate(fips_maps = fips) %>% 
  left_join(va_cross) %>% 
  mutate(fips = ifelse(!is.na(fips_bea), fips_bea, fips)) %>% 
  distinct(fips, .keep_all = TRUE) %>% 
  dplyr::select(-polyname, -fips_maps, -fips_bea)

# employment
county_employment = fread("raw/BEA/CAEMP25S__ALL_AREAS_1969_2000.csv", fill = TRUE) %>%
    as_tibble() %>%
    clean_names() %>%
    filter(geo_fips != "00000" & industry_classification != "..." & industry_classification != "") %>%
    select(-line_code, -description, -unit, -region, -table_name) %>%
    pivot_longer(
        cols = starts_with("x"),
        names_prefix = "x",
        names_to = "year",
        values_to = "county_employment"
    ) %>%
    mutate(year = as.numeric(year)) %>%
    pivot_wider(
        names_from = "industry_classification",
        names_prefix = "industry_employment_",
        values_from = "county_employment"
    ) %>%
    clean_names() %>% 
    mutate(across(industry_employment_01_02:industry_employment_i, ~ as.numeric(.x)))

county_employment[is.na(county_employment)] = 0

county_employment = county_employment %>% 
    mutate(employment_m = industry_employment_d,
           employment_o = industry_employment_01_02 + industry_employment_07_09 + industry_employment_b  + 
               industry_employment_c  + industry_employment_e  + industry_employment_f + 
               industry_employment_g + industry_employment_h  + industry_employment_i) %>% 
    select(geo_fips, year, employment_m, employment_o) %>% 
    filter(substr(geo_fips, 4, 5) != "00") %>% 
    filter(year >= 1990 & year <= 1997) %>% 
    pivot_longer(
        cols = 3:4,
        names_to = "industry",
        names_prefix = "employment_",
        values_to = "employment"
    )

## industry-specific county-level income 1969-2000 (SIC codes)
county_income_industry = fread("raw/BEA//CAINC5S__ALL_AREAS_1969_2000.csv", fill = TRUE) %>%
    as_tibble() %>%
    clean_names() %>%
    filter(geo_fips != "00000" & industry_classification != "..." & industry_classification != "") %>%
    select(-line_code, -description, -unit, -region, -table_name) %>%
    pivot_longer(
        cols = starts_with("x"),
        names_prefix = "x",
        names_to = "year",
        values_to = "county_industry_income"
    ) %>%
    mutate(year = as.numeric(year)) %>%
    pivot_wider(
        names_from = "industry_classification",
        names_prefix = "industry_income_",
        values_from = "county_industry_income"
    ) %>%
    clean_names() %>% 
    mutate(across(4:84, ~ as.numeric(.x)))

county_income_industry[is.na(county_income_industry)] = 0

county_income_industry = county_income_industry %>% 
    mutate(income_m = industry_income_d,
           income_o = industry_income_01_02 + industry_income_07_09 + industry_income_b  + 
               industry_income_c  + industry_income_e  + industry_income_f + 
               industry_income_g + industry_income_h  + industry_income_i) %>% 
    select(geo_fips, year, income_m, income_o) %>% 
    filter(substr(geo_fips, 4, 5) != "00") %>% 
    filter(year >= 1990 & year <= 1997) %>% 
    pivot_longer(
        cols = 3:4,
        names_to = "industry",
        names_prefix = "income_",
        values_to = "income"
    )

county_df = inner_join(county_income_industry, county_employment) %>% 
    group_by(geo_fips, year) %>% 
    mutate(inc_test = prod(income),
           emp_test = prod(employment)) %>% 
    ungroup() %>% 
    filter(inc_test > 0 & emp_test > 0) %>% 
    rename(fips = geo_fips) %>% 
    mutate(wage = income/employment) %>% 
    arrange(fips, industry, year) %>% 
    ungroup() %>% 
    filter(year == 1997)

total_employment = county_df %>% 
  group_by(fips) %>% 
  summarise(employment = sum(employment)) %>% 
  ungroup()

total_employment = total_employment %>% 
  inner_join(unemployment) %>% 
  mutate(employment = unemployed_to_employed * employment,
         year = 1997, 
         industry = "u") %>% 
  select(fips, employment, year, industry)

fips_int = intersect(county_df$fips, total_employment$fips)

county_df = bind_rows(county_df, total_employment) %>% 
  arrange(fips) %>% 
  filter(fips %in% fips_int)
  
fwrite(county_df, "data/payroll_cf_temp.csv")